This notebook documents experiments with a bottom-up strategy for determining notebook similarity. It is based on the notion that it is easier to aggregate small pieces than to break down a 'black box.'
The biggest challenge is working with the AST structure. Because it is a tree, we need to merge leaves with their parents, working our way up.
There are two main goals:
In [1]:
# Necessary imports
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.features.featurize.ast_graph.ast_graph import *
In [2]:
# Collect the path of every .ipynb file that sits directly inside a
# sub-directory of the testbed root, wrap each path in a NotebookMiner, and
# build the ASTFeatures object `a` from those miners.
root = '../testbed/Final'
people = os.listdir(root)
notebooks = []
for entry in people:
    person_dir = os.path.join(root, entry)
    # Plain files at the top level of the testbed are skipped.
    if not os.path.isdir(person_dir):
        continue
    for filename in os.listdir(person_dir):
        if filename.endswith('.ipynb'):
            notebooks.append(os.path.join(person_dir, filename))
notebook_objs = list(map(NotebookMiner, notebooks))
a = ASTFeatures(notebook_objs)
In [3]:
# Replace each entry of a.nb_features, in place and in order, with the value
# returned by its own get_new_notebook() call.
converted = [nb.get_new_notebook() for nb in a.nb_features]
a.nb_features[:] = converted
In [ ]:
In [4]:
graphs = []
for nb in a.nb_features:
for cell in nb.get_all_cells():
graphs.append(cell.get_feature('graph'))
agr = ASTGraphReducer(graphs)
num_nodes = []
for g in agr.graphs:
num_nodes.append(g.graph_nodes())
print ('Total number of graphs:',agr.number_graphs())
print ('Total number of graphs with one node:',agr.number_single())
print ('Total number of nodes:',agr.count_nodes())
print (agr.count_nodes())
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(num_nodes, bins=30)
Out[4]:
In [5]:
# Call agr.build_relations() repeatedly until two consecutive
# agr.count_nodes() readings are equal.
# The starting values 0 and 1 only need to differ so the loop is entered.
# NOTE(review): indentation was lost in this export; the while body
# presumably runs through the final print -- confirm against the notebook.
cur_count = 0
new_count = 1
print (agr.count_nodes())
while cur_count != new_count:
cur_count = new_count
# The count is read BEFORE build_relations() in each pass, so the loop
# compares the counts taken before the previous and the current pass.
new_count = (agr.count_nodes())
agr.build_relations()
print (new_count)
In [6]:
num_nodes = []
for g in agr.graphs:
num_nodes.append(g.graph_nodes())
print ('Total number of graphs:',agr.number_graphs())
print ('Total number of graphs with one node:',agr.number_single())
print ('Total number of nodes:',agr.count_nodes())
print (agr.count_nodes())
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(num_nodes, bins=30)
Out[6]:
In [7]:
# Similarity between notebook 0 and the other notebooks: take element [1][1]
# of each result from notebook_jaccard_similarity(0) and print the values in
# ascending order.
scores = [entry[1][1] for entry in a.notebook_jaccard_similarity(0)]
scores.sort()
print(scores)
In [8]:
# Maximum similarity: scan the similarity results of every notebook and keep
# the largest [1][1] value seen, together with (notebook index, result[0]).
all_sims = []
max_sim = 0
max_val = None
for nb_index, _ in enumerate(a.nb_features):
    for result in a.notebook_jaccard_similarity(nb_index):
        score = result[1][1]
        # Strict comparison: on ties the first occurrence is kept.
        if score > max_sim:
            max_sim, max_val = score, (nb_index, result[0])
max_sim, max_val
Out[8]:
In [9]:
# Display the filename of the notebook stored at index 2 of a.nb_features.
a.nb_features[2].notebook.filename
Out[9]:
In [10]:
# Display the filename of the notebook stored at index 3 of a.nb_features.
a.nb_features[3].notebook.filename
Out[10]:
Now we're interested in what happened with this bottom-up approach. What does the final result look like? We can print out each graph to get a sense of what has happened; then we can look at some actual code, what it looks like in graph format, and what the black boxes it holds actually mean.
In [21]:
# Print the node list of the 'graph' feature for each cell of notebook 25.
notebook = a.nb_features[25]
for cell in notebook.get_all_cells():
    graph = cell.get_feature('graph')
    print(graph.get_nodes())
In [22]:
# Print the node list of the 'graph' feature for each cell of notebook 39.
notebook = a.nb_features[39]
for cell in notebook.get_all_cells():
    graph = cell.get_feature('graph')
    print(graph.get_nodes())
In [13]:
from itertools import groupby

# Flatten the cells of every notebook into a single list, in notebook order.
cells = []
for nb in a.nb_features:
    cells.extend(nb.get_all_cells())

# Group consecutive cells that share the same 'original_code' feature.
# groupby keeps every cell of a run, including the one that starts the run,
# and also emits the final run. A manual "reset on change" loop is easy to
# get wrong on both counts.
# NOTE: single-cell runs are included as one-element groups, so positions in
# `groups` can differ from results produced by earlier versions of this cell.
groups = [
    list(run)
    for _, run in groupby(cells, key=lambda cell: cell.get_feature('original_code'))
]
In [14]:
# Inspect one group of cells: its graph nodes, the code it came from, and the
# trace behind every cell whose graph is a single black-box node.
group = 6
selected = groups[group]
separator = '*' * 50

print(separator)
print('Black Boxes')
for cell in selected:
    print(cell.get_feature('graph').get_nodes())

print(separator)
print('Code')
# All cells of a group share the same 'original_code', so the first one is used.
print(selected[0].get_feature('original_code'))

print(separator)
print('Black Box meaning')
for cell in selected:
    nodes = cell.get_feature('graph').get_nodes()
    if len(nodes) != 1:
        continue
    only_node = nodes[0]
    # Only node names beginning with 'black' are looked up in the reducer.
    if only_node[:5] == 'black':
        print(agr.get_trace(only_node))
In [31]:
# Print what the reducer's get_trace() returns for one specific black-box name.
trace = agr.get_trace('black_box1288')
print(trace)
In [24]:
# Print every key of agr.names that contains the substring 'Call'.
for name in filter(lambda key: 'Call' in key, agr.names.keys()):
    print(name)
In [17]:
# One list of 'graph' features per notebook, preserving notebook order and,
# within each notebook, cell order.
graph_sets = [
    [cell.get_feature('graph') for cell in nb.get_all_cells()]
    for nb in a.nb_features
]
In [18]:
# Build an ASTGraphCombiner from graph_sets (one list of graphs per notebook).
agc = ASTGraphCombiner(graph_sets)
In [19]:
# Report the combiner's graph count before and after reduce_graphs(), then
# its count of distinct nodes.
graphs_before = agc.count_graphs()
print('before', graphs_before)

agc.reduce_graphs()

graphs_after = agc.count_graphs()
print('after', graphs_after)

distinct_nodes = agc.count_distinct_nodes()
print('total_distinct', distinct_nodes)
In [20]:
# Print the node list of each graph in the combiner's first graph set.
first_set = agc.graph_sets[0]
for graph in first_set:
    nodes = graph.get_nodes()
    print(nodes)
In [ ]: